# North Korean elite network: 
#   R code for generating tables and graphics for
#   "Scraping public co-occurrences for statistical network analysis 
#    of political elites"
# 
# 
# Created by Paasha Mahdavi
# This version: 9 July 2017
#
#-----------------------------------------------------------------------------#
#    Loading packages and data                                                #
#-----------------------------------------------------------------------------#

rm(list = ls())

# Install required packages
pkg <- c("TeachingDemos","statnet","stargazer","miscTools","stringr","texreg","ggplot2",
         "gridExtra","dplyr")
inst <- pkg %in% installed.packages()
if (length(pkg[!inst]) > 0) install.packages(pkg[!inst])
rm(pkg, inst)

# Load required packages 
library("TeachingDemos")
library("statnet")
library("stargazer")
library("miscTools")
library("stringr")
library("texreg")
library("ggplot2")
library("gridExtra")
library("dplyr")



#-----------------------------------------------------------------------------#
#    Setting up code for generating log file                                  #
#-----------------------------------------------------------------------------#

txtStart(file = "northkorea-replication.log", commands = TRUE, results = TRUE)



#-----------------------------------------------------------------------------#
#    Loading and analyzing scraped data                                       #
#-----------------------------------------------------------------------------#

dprk <- read.csv("northkorea2012.csv",header = F)

# Re-naming and removing extra \" \"

names(dprk) <- c("name1","name2","hits")

dprk$name1 <- gsub('\"','',dprk$name1)
dprk$name2 <- gsub('\"','',dprk$name2)


# Network data

dprk.b <- dprk[1:820,c("name2","name1","hits")]
names(dprk.b) <- c("name1","name2","hits")
dprk2 <- data.frame(rbind(dprk,dprk.b))
dprk2 <- dprk2 %>% arrange(name2)
dprk2 <- dprk2 %>% arrange(name1)


# Making into network matrix (adjacency or sociomatrix)

nwk.s.df <- reshape(dprk2, v.names="hits", timevar="name2", direction="wide", idvar="name1")

nwk.s <- as.matrix(nwk.s.df[1:nrow(nwk.s.df),2:ncol(nwk.s.df)])

nwk <- network(nwk.s, 
               directed=FALSE, 
               matrix.type="a", 
               ignore.eval=FALSE, 
               names.eval="hits")

dprk3 <- as.matrix(nwk, 
                   attrname = "hits",
                   matrix.type = "edgelist")

nwk1 <- nwk
delete.edges(nwk1, seq_along(nwk1$mel))
nwk1[dprk3[,1:2], names.eval="hits", add.edges=TRUE] <- dprk2[,3]

k <- max(log(as.matrix(nwk1, attrname="hits")+1))
nwk.col <- matrix(gray(1-(log(as.matrix(nwk1, attrname="hits")+1)/k)),nrow=network.size(nwk1))


# Network plot
#plot(nwk1, edge.col=nwk.col, usecurve=TRUE, edge.curve=.01,displaylabels=FALSE, displayisolates=TRUE)


# Getting edgelist from network 

dprk.edge <- as.matrix(nwk1, 
                       attrname = "hits",
                       matrix.type = "edgelist")


# Centrality scores

cent <- data.frame(names = nwk.s.df$name1)
cent$evcent <- evcent(nwk1, ignore.eval=FALSE)
cent$between <- betweenness(nwk1, ignore.eval=FALSE)
cent$degrees <- degree(nwk1, ignore.eval=FALSE)



#-----------------------------------------------------------------------------#
#    Loading and analyzing data from Ishiyama (2014)                          #
#-----------------------------------------------------------------------------#

# Loading bipartite network excel file from Ishiyama 2014

ishiyama <- read.csv("ishiyama2014-kju2012.csv", header=T)
ish <- data.matrix(ishiyama[,2:54])
row.names(ish) <- ishiyama[,1]
ish.sm <- ish %*% t(ish)


# Making into network matrix (adjacency, aka sociomatrix)

nwk.ish <- network(ish.sm, 
                   directed=TRUE, 
                   matrix.type="a", 
                   ignore.eval=FALSE, 
                   names.eval="hits")


# Getting edgelist from network 

dprk.edge.ish <- as.matrix(nwk.ish, 
                           attrname = "hits",
                           matrix.type = "edgelist")


# Centrality scores
cent.ish <- data.frame(names = rownames(ish.sm))
cent.ish$evcent <- evcent(nwk.ish, ignore.eval=FALSE)
cent.ish$between <- betweenness(nwk.ish, ignore.eval=FALSE)
cent.ish$degrees <- degree(nwk.ish, ignore.eval=FALSE)



#-----------------------------------------------------------------------------#
#    Comparing network matrices                                               #
#-----------------------------------------------------------------------------#

# Scraped data

scraped.data <- dprk
names(scraped.data) <- c("name1","name2","gchits")


# Ishiyama data

ish.data <- as.data.frame(dprk.edge.ish)
ish.names <- as.data.frame(attr(dprk.edge.ish,"vnames"))
ish.names$order <- rownames(ish.names)

ish.data <- merge(ish.data,ish.names,by.x="V1",by.y="order")
ish.data <- merge(ish.data,ish.names,by.x="V2",by.y="order")

ish.data <- ish.data[,c(4,5,3)]
names(ish.data) <- c("name1","name2","visits")


# Combined data

combined <- merge(x = scraped.data, 
                  y = ish.data,
                  by = c("name1", "name2"), 
                  all.x = T, 
                  all.y = T)

combined <- merge(x = combined, 
                  y = ish.data, 
                  by.x = c("name1", "name2"),
                  by.y = c("name2", "name1"),
                  all.x = T, 
                  all.y = T)

combined <- merge(x = combined, 
                  y = scraped.data, 
                  by.x = c("name1", "name2"), 
                  by.y = c("name2", "name1"), 
                  all.x = T, 
                  all.y = T)

# Google (scraped) hits

combined$gchits <- ifelse(is.na(combined$gchits.x),
                          combined$gchits.y,
                          combined$gchits.x)

# Number of visits

combined$visits <- ifelse(is.na(combined$visits.x),
                          combined$visits.y,
                          combined$visits.x)


# Replacing NA with 0 (i.e., no google hits or visits)

combined$gchits <- ifelse(is.na(combined$gchits),
                          0,
                          combined$gchits)

combined$visits <- ifelse(is.na(combined$visits),
                          0,
                          combined$visits)


# Removing looped pairs (i.e., self-hits or "a-a" pairs)

dat <- combined[combined$name1!=combined$name2,]



#-----------------------------------------------------------------------------#
#    Appendix Figure 1                                                        #
#-----------------------------------------------------------------------------#

# Plotting co-occurrences from scraped vs. Ishiyama-collected
#
# <------ NOTE: points are jittered for visual clarity ------>
#
ggplot(data = dat, 
       aes(x = jitter(log(visits+1),3), y = jitter(log(gchits+1),10))) + 
  geom_point(alpha=0.5,col="lightgray") + 
  stat_smooth(method="lm",level = 0.99) + 
  labs(x = "\n Logged co-occurrences (Ishiyama)", 
       y="Logged co-occurrences (Scraped) \n ") + 
  theme_bw()

ggsave(filename = "AppendixFigure1.pdf", width = 8.5, height = 7, units = "in")




#-----------------------------------------------------------------------------#
#    Appendix Figure 2                                                        #
#-----------------------------------------------------------------------------#

# Function for 95% intervals for plotting means

std <- function(x) sd(x)/sqrt(length(x))
means <- summarize(group_by(dat,visits), 
                   med = median(gchits), 
                   std = 1.96*std(gchits))


ggplot(data = dat[dat$visits<9,]) + 
  geom_point(aes(x = jitter(visits), 
                 y = jitter(gchits)), 
             alpha=0.5, 
             col="lightgray") + 
  geom_crossbar(data = means[means$visits<9,], 
                aes(x = visits, 
                    y = med, 
                    ymin = med - std, 
                    ymax = med + std, 
                    group = factor(visits)), 
                width=0.25) + 
  labs(x = "Co-occurrences (Ishiyama)", 
       y="Co-occurrences (Scraped) \n ") + 
  coord_cartesian(ylim=c(0,20)) + 
  theme_bw()

ggsave(filename = "AppendixFigure2.pdf", width = 8.5, height = 7, units = "in")



#-----------------------------------------------------------------------------#
#    Statistical summaries in appendix text, p.1                              #
#-----------------------------------------------------------------------------#

cat("### --- Statistical summaries reported in Appendix page 1 --- ###")

# Pearson correlation

with(dat,cor(visits,gchits))
#  0.5830678


# OLS regression (log-log)

summary( lm( log(gchits+1) ~ log(visits+1), data = dat ) )
# coeffcient of 0.93 (SE = 0.03, t = 34.65)


# OLS regression (no transformations)

summary( lm( gchits ~ visits, data = dat ) )
# coeffcient of 1.83 (SE = 0.06, t = 29.05)



#-----------------------------------------------------------------------------#
#    Stop generating log file                                                 #
#-----------------------------------------------------------------------------#

txtStop()
